In [36]:
from __future__ import print_function
import numpy as np
import pandas as pd
from collections import OrderedDict #sorting participant df dict before pd.concat()
import matplotlib.pylab as plt
%matplotlib inline
pd.options.display.mpl_style = 'default'
# Participants that are excluded from all performance analysis
non_english_fluent = ['023', '031', '045', '050', '070', '106',]
left_handed = ['042', '088',]
excluded_all_tasks = non_english_fluent + left_handed
# Pilot subjects
excluded_all_tasks += ['010', '011', '012', '013', '014']
In [37]:
def isolate_isip_task(taskname):
# requires the global dbase variable (loaded in In [40] below)
db_to_isip = dbase.swaplevel(1,2).swaplevel(0,1).xs('tap_r', drop_level=True)
del db_to_isip['channel']
del db_to_isip['pitch']
#channel==1 and pitch==48 verified for all tap_r data points
db_allsubs = db_to_isip.xs(taskname, drop_level=True)
return db_allsubs
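For reference, here is what the level shuffling above does on a toy MultiIndex (made-up rows; the level names task, pid, stamp_type, csv_line match how dbase is indexed elsewhere in this notebook):

import pandas as pd
idx = pd.MultiIndex.from_tuples(
    [('ISIP_5', '015', 'tap_r', 1), ('ISIP_5', '015', 'loopback', 2)],
    names=['task', 'pid', 'stamp_type', 'csv_line'])
toy = pd.DataFrame({'task_ms': [100, 105]}, index=idx)
# (task, pid, stamp_type, csv_line) -> (stamp_type, task, pid, csv_line)
front = toy.swaplevel(1, 2).swaplevel(0, 1)
print(front.xs('tap_r', drop_level=True))   # index is now (task, pid, csv_line)
print(front.xs('tap_r').xs('ISIP_5'))       # index is now (pid, csv_line)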
In [38]:
def run_isip_grouping_filter(df,
minimum_interval,
#filter_stdev_radius,
maximum_interval,
start_recording):
grouped = df.groupby(level='pid') #, 'stamp_type'])
def filter_intervals(df):
filt1 = df.copy()
# remove timestamps prior to the end of stimuli
filt1 = filt1.loc[filt1.task_ms >= start_recording]
# remove timestamps that end an undersized interval
filt1 = filt1.loc[ (filt1.int_raw >= minimum_interval)
| (filt1.int_raw.isnull()) ]
# special case: p048 @800ms, outlier short intervals
# among >1000 average interval
# (removed, not needed: p048 was left-handed, eliminated)
#if filt1.int_raw.median() > 1000:
# print(filt1.int_raw.count())
# print("applied")
# filt1 = filt1.loc[ (filt1.int_raw >= 650)
# | (filt1.int_raw.isnull()) ]
# print(filt1.int_raw.count())
# special case: p049 @500ms, outlier long intervals
# among an abnormally short run (median < 375ms)
if filt1.int_raw.median() < 375:
print(filt1.int_raw.count())
print("applied")
filt1 = filt1.loc[ (filt1.int_raw <= 625)
| (filt1.int_raw.isnull()) ]
print(filt1.int_raw.count())
# end special cases
# interval recalculation now skips the too-soon taps.
int_filt1 = filt1.task_ms - filt1.task_ms.shift(1)
int_max_exceeded = int_filt1 > maximum_interval
# int_filt1 is our nearly-final interval sequence, but it still has NaNs where
# intervals were combined, and it still contains overlong intervals for now.
# Make a pair of series sharing the same index: only the timestamps that
# survived the too-SHORT filter. Intervals we combined because a tap came
# too soon are 'legitimate' and stay in the lagged comparison; intervals
# excluded for being overlong disrupt the lagged-comparison process and
# should trigger an exclusion of the surrounding data points from the
# lagged deviation calculation.
# More succinctly: avoid calculating a lag-2 dev before and after
# a skipped, overlong interval.
intsequence = int_filt1[int_filt1.notnull()]
maxex_sequence = int_max_exceeded[int_filt1.notnull()]
intsequence[maxex_sequence == True] = np.nan
df['ints_filtered'] = intsequence
# drop values whose lag window touches an overlong (excluded) interval;
# closes over maxex_sequence from the enclosing scope
def skip_missing_lagvalues(series, lag_values):
for n in lag_values:
series = series[maxex_sequence.shift(n) != True]
return series
lag2dev = intsequence - intsequence.shift(2)
lag2dev = skip_missing_lagvalues(lag2dev, [0, 1, 2])
df['lag2dev'] = lag2dev
df['lag2devsq'] = lag2dev ** 2
# rolling_mean(): "By default, the result is set to the right edge of the
# window. This can be changed to the center of the window
# by setting center=True."
# shift(1) sets the result series one ahead of the end of the window, so that
# each value is compared with the mean of the N intervals preceding it.
movingmean_prev2 = pd.rolling_mean(intsequence.shift(1), window=2)
movingmean_prev2 = skip_missing_lagvalues(movingmean_prev2, [1, 2])
df['movingmean_prev2'] = movingmean_prev2
lagdev_avgprev2 = intsequence - pd.rolling_mean(intsequence.shift(1), window=2)
lagdev_avgprev2 = skip_missing_lagvalues(lagdev_avgprev2, [0, 1, 2])
df['lagdev_avgprev2'] = lagdev_avgprev2
df['lagdev_avgprev2sq'] = lagdev_avgprev2 ** 2
movingmean_prev3 = pd.rolling_mean(intsequence.shift(1), window=3)
movingmean_prev3 = skip_missing_lagvalues(movingmean_prev3, [1, 2, 3])
df['movingmean_prev3'] = movingmean_prev3
lagdev_avgprev3 = intsequence - pd.rolling_mean(intsequence.shift(1), window=3)
#lagdev_avgprev3 = skip_missing_lagvalues(lagdev_avgprev3, [0, 1, 2, 3])
df['lagdev_avgprev3'] = lagdev_avgprev3
df['lagdev_avgprev3sq'] = lagdev_avgprev3 ** 2
movingmean_prev4 = pd.rolling_mean(intsequence.shift(1), window=4)
movingmean_prev4 = skip_missing_lagvalues(movingmean_prev4, [1, 2, 3, 4])
df['movingmean_prev4'] = movingmean_prev4
lagdev_avgprev4 = intsequence - pd.rolling_mean(intsequence.shift(1), window=4)
lagdev_avgprev4 = skip_missing_lagvalues(lagdev_avgprev4, [0, 1, 2, 3, 4])
df['lagdev_avgprev4'] = lagdev_avgprev4
df['lagdev_avgprev4sq'] = lagdev_avgprev4 ** 2
df['LD_AP4_MMskipmis_var'] = movingmean_prev4.var(ddof=1)
df['LD_AP4_MMskipmis_std'] = movingmean_prev4.std(ddof=1)
df['LD_AP4_MMskipmis_len'] = movingmean_prev4.count()
# same moving mean, but keeping entries around overlong intervals (no skip)
temp_noskip_4 = pd.rolling_mean(intsequence.shift(1), window=4)
df['LD_AP4_MMkeepmis_var'] = temp_noskip_4.var(ddof=1)
df['LD_AP4_MMkeepmis_std'] = temp_noskip_4.std(ddof=1)
df['LD_AP4_MMkeepmis_len'] = temp_noskip_4.count()
movingmean_prev_12 = pd.rolling_mean(intsequence.shift(1), window=12)
#movingmean_prev_12 = skip_missing_lagvalues(movingmean_prev_12, range(1, 12 + 1))
df['movingmean_prev_12'] = movingmean_prev_12
lagdev_avgprev_12 = intsequence - pd.rolling_mean(intsequence.shift(1), window=12)
#lagdev_avgprev_12 = skip_missing_lagvalues(lagdev_avgprev_12, range(0, 12 + 1))
df['lagdev_avgprev_12'] = lagdev_avgprev_12
df['lagdev_avgprev_12sq'] = lagdev_avgprev_12 ** 2
df['int_filt1'] = int_filt1
df['int_max_exceeded'] = int_max_exceeded
df['ints'] = int_filt1.loc[~int_max_exceeded]
return df
df = grouped.apply(filter_intervals)
#print(df)
return df
(Image from a related article; the "34" in it does not apply to the current dataset.)
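To make the interval-skipping logic above concrete: an overlong interval is NaN'd out of the sequence, and skip_missing_lagvalues() then drops every lagged value whose window touches that slot, rather than letting a deviation be computed across the gap. A minimal sketch with toy numbers (not study data):

import numpy as np
import pandas as pd
ints = pd.Series([500., 510., 505., 1400., 495., 500., 505.])
too_long = ints > 650                  # stands in for int_max_exceeded
ints[too_long] = np.nan                # exclude the overlong interval itself
lag2dev = ints - ints.shift(2)
keep = pd.Series(True, index=ints.index)
for n in (0, 1, 2): keep &= too_long.shift(n) != True
print(lag2dev[keep])                   # nothing is differenced across the gap
# the prev-N moving means work the same way; in current pandas the old
# pd.rolling_mean(s.shift(1), window=2) is spelled s.shift(1).rolling(2).mean()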
In [39]:
def isip_outcomes_taskdf(isip_db,
squared_local_dev_measure='lag2devsq'):
pid_list = sorted(list(isip_db.index.get_level_values('pid').unique()))
df = pd.DataFrame(index=pid_list)
df.index.names = ['pid']
ints = isip_db.ints.groupby(level='pid')
ints_count = ints.apply(lambda s: s.count()) #count() ignores nulls
ints_mean = ints.apply(lambda s: s.mean())
ints_variance = ints.apply(lambda s: s.var(ddof=1))
ints_stdev = ints.apply(lambda s: s.std(ddof=1))
ints_lag2corr = ints.corr(ints.shift(2))
lagdevsq_series = isip_db[squared_local_dev_measure]
lagdevsq = lagdevsq_series.groupby(level='pid')
lagdevsq_count = lagdevsq.apply(lambda s: s.count()) # (N - 2)
lagdevsq_mean = lagdevsq.apply(lambda s: s.mean())
# sum over i of (x_{i+2} - x_i)^2
lagdevsq_sum = lagdevsq.apply(lambda s: s.sum())
# because of the problem noted below, the lagdevsq_count variable might need
# to change to the overall count (ints_count) minus one.
local_sq_abs = lagdevsq_sum / (2. * lagdevsq_count)
local = 100 * (1. / ints_mean) * np.sqrt(local_sq_abs)
# PROBLEM: the total variance uses all the intervals, while local_sq_abs
# drops data points wherever they aren't sequential...
drift = 100 * ((1. / ints_mean) * np.sqrt(ints_variance - local_sq_abs))
df['ints_count'] = ints_count
df['ints_mean'] = ints_mean
df['ints_variance'] = ints_variance
df['ints_stdev'] = ints_stdev
df['ints_lag2corr'] = ints_lag2corr
#df['devsq_sum'] = lagdevsq_sum
#df['devsq_count'] = lagdevsq_count
#df['devsq_mean'] = lagdevsq_mean
#df['local_sq_abs'] = local_sq_abs
#df['local'] = local
#df['drift'] = drift
df[squared_local_dev_measure + '_sum'] = lagdevsq_sum
df[squared_local_dev_measure + '_count'] = lagdevsq_count
df[squared_local_dev_measure + '_mean'] = lagdevsq_mean
df[squared_local_dev_measure + '_local_sq_abs'] = local_sq_abs
df[squared_local_dev_measure + '_local'] = local
df[squared_local_dev_measure + '_drift'] = drift
return df
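In words: lagdevsq_sum / (2 * N') estimates the tap-to-tap ("local") variance from the squared lag-2 differences, drift is the remainder of the total variance, and both are rescaled to percent of the mean interval. A quick synthetic check of the split (toy data, not from the study):

import numpy as np
import pandas as pd
rng = np.random.RandomState(1)
# white noise around 500 ms plus a slow linear trend
x = pd.Series(rng.normal(500, 10, 100) + np.linspace(0, 60, 100))
lag2sq = (x - x.shift(2)) ** 2
local_sq_abs = lag2sq.sum() / (2. * lag2sq.count())
local = 100 / x.mean() * np.sqrt(local_sq_abs)                  # ~ noise CV%
drift = 100 / x.mean() * np.sqrt(x.var(ddof=1) - local_sq_abs)  # picks up the trend
print(local, drift)

Note that nothing guarantees ints_variance >= local_sq_abs; when lag-2 autocorrelation is negative, the sqrt in the drift formula goes NaN, which is the PROBLEM flagged in the comments above and checked in In [50].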
In [40]:
# Interval recording starts 5 x ISI after the stimuli end (waits: 2.5s for ISIP_5, 4.0s for ISIP_8)
ISIP_5_ENDSTIMS_MS = 19500
ISIP_5_WAIT_AFTER_STIMS_MS = 2500
ISIP_5_MINIMUM_INT = 375
ISIP_5_MAXIMUM_INT = 650
isip_5_start_recording = ISIP_5_ENDSTIMS_MS + ISIP_5_WAIT_AFTER_STIMS_MS
#ISIP_500_DISQUALIFIED = ['045', #did not complete task correctly
# '042', '048' #left-handed
# ]
ISIP_8_ENDSTIMS_MS = 23200
ISIP_8_WAIT_AFTER_STIMS_MS = 4000
ISIP_8_MINIMUM_INT = 600
ISIP_8_MAXIMUM_INT = 1000
isip_8_start_recording = ISIP_8_ENDSTIMS_MS + ISIP_8_WAIT_AFTER_STIMS_MS
#ISIP_800_DISQUALIFIED = ['045', #did not complete task correctly
# '042', '048' #left-handed
# ]
pickled_dbase = "c:/db_pickles/pickle - dbase - 2014-10-03b.pickle"
dbase = pd.read_pickle(pickled_dbase)
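The start-of-recording values follow directly from the constants above (end of stimuli plus five inter-stimulus intervals); a quick arithmetic check:

assert ISIP_5_WAIT_AFTER_STIMS_MS == 5 * 500 and isip_5_start_recording == 22000
assert ISIP_8_WAIT_AFTER_STIMS_MS == 5 * 800 and isip_8_start_recording == 27200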
In [41]:
dbase = dbase.drop(excluded_all_tasks, level='pid')
In [42]:
db_isip5_allsubs = isolate_isip_task('ISIP_5')
db_isip8_allsubs = isolate_isip_task('ISIP_8')
db_isip5 = db_isip5_allsubs
db_isip8 = db_isip8_allsubs
#db_isip5 = db_isip5_allsubs.drop(ISIP_500_DISQUALIFIED)
#db_isip8 = db_isip8_allsubs.drop(ISIP_800_DISQUALIFIED)
#delete_columns_for_filter_debug_rerun(db_isip5)
print('isip5:')
db_isip5 = run_isip_grouping_filter(db_isip5,
minimum_interval = ISIP_5_MINIMUM_INT,
#filter_stdev_radius = ISIP_5_FILTER_STDEV_RADIUS,
maximum_interval = ISIP_5_MAXIMUM_INT,
start_recording = isip_5_start_recording)
print('isip8:')
#delete_columns_for_filter_debug_rerun(db_isip8)
db_isip8 = run_isip_grouping_filter(db_isip8,
minimum_interval = ISIP_8_MINIMUM_INT,
#filter_stdev_radius = ISIP_8_FILTER_STDEV_RADIUS,
maximum_interval = ISIP_8_MAXIMUM_INT,
start_recording = isip_8_start_recording)
In [44]:
pid_list_800 = sorted(list(db_isip8.index.get_level_values('pid').unique()))
pid_list_500 = sorted(list(db_isip5.index.get_level_values('pid').unique()))
print('\n\n800')
for pid in pid_list_800:
print(pid, end=",")
assert db_isip8.ints_filtered.xs(pid).max() <= 1000
assert db_isip8.ints_filtered.xs(pid).min() >= 600
print(db_isip8.ints_filtered.xs(pid).count())
print('\n\n500')
for pid in pid_list_500:
print(pid, end=",")
assert db_isip5.ints_filtered.xs(pid).max() <= 650
assert db_isip5.ints_filtered.xs(pid).min() >= 375
print(db_isip5.ints_filtered.xs(pid).count())
In [33]:
db_isip8.ints_filtered.xs('015').hist()
Out[33]:
In [45]:
# Excluded: the number of qualifying intervals is far below normal --
# pid 049 on the 500ms task, and pid 048 on the 800ms task.
print(db_isip5.ints_filtered.xs('049').count())
print(db_isip8.ints_filtered.xs('048').count())
db_isip5 = db_isip5.drop('049', level='pid')
db_isip8 = db_isip8.drop('048', level='pid')
In [47]:
#outcome_dfs_isip5 = {}
outcomesdf_isip5_lag2 = isip_outcomes_taskdf(db_isip5, 'lag2devsq')
#outcomesdf_isip5_avgprev2 = isip_outcomes_taskdf(db_isip5, 'lagdev_avgprev2sq')
#outcomesdf_isip5_avgprev3 = isip_outcomes_taskdf(db_isip5, 'lagdev_avgprev3sq')
outcomesdf_isip5_avgprev4 = isip_outcomes_taskdf(db_isip5, 'lagdev_avgprev4sq')
#outcomesdf_isip5_avgprev_12 = isip_outcomes_taskdf(db_isip5, 'lagdev_avgprev_12sq')
outcomesdf_isip8_lag2 = isip_outcomes_taskdf(db_isip8, 'lag2devsq')
#outcomesdf_isip8_avgprev2 = isip_outcomes_taskdf(db_isip8, 'lagdev_avgprev2sq')
#outcomesdf_isip8_avgprev3 = isip_outcomes_taskdf(db_isip8, 'lagdev_avgprev3sq')
outcomesdf_isip8_avgprev4 = isip_outcomes_taskdf(db_isip8, 'lagdev_avgprev4sq')
#outcomesdf_isip8_avgprev_12 = isip_outcomes_taskdf(db_isip8, 'lagdev_avgprev_12sq')
In [48]:
updated = "2014-10-12b"
db_isip5.to_csv('isip5_intervals - ' + updated + '.csv')
db_isip8.to_csv('isip8_intervals - ' + updated + '.csv')
outcomesdf_isip5_lag2.to_csv('dfo-isip5_lag2 - ' + updated + '.csv')
#outcomesdf_isip5_avgprev2.to_csv('dfo-isip5_avgprev2 - ' + updated + '.csv')
#outcomesdf_isip5_avgprev3.to_csv('dfo-isip5_avgprev3 - ' + updated + '.csv')
outcomesdf_isip5_avgprev4.to_csv('dfo-isip5_avgprev4 - ' + updated + '.csv')
#outcomesdf_isip5_avgprev_12.to_csv('dfo-isip5_avgprev_12 - ' + updated + '.csv')
outcomesdf_isip8_lag2.to_csv('dfo-isip8_lag2 - ' + updated + '.csv')
#outcomesdf_isip8_avgprev2.to_csv('dfo-isip8_avgprev2 - ' + updated + '.csv')
#outcomesdf_isip8_avgprev3.to_csv('dfo-isip8_avgprev3 - ' + updated + '.csv')
outcomesdf_isip8_avgprev4.to_csv('dfo-isip8_avgprev4 - ' + updated + '.csv')
#outcomesdf_isip8_avgprev_12.to_csv('dfo-isip8_avgprev_12 - ' + updated + '.csv')
prefix = "c:/db_pickles/pickle - "
outcomesdf_isip5_lag2.to_pickle(prefix + 'dfo-isip5_lag2 - ' + updated + '.pickle')
#outcomesdf_isip5_avgprev2.to_pickle(prefix + 'dfo-isip5_avgprev2 - ' + updated + '.pickle')
#outcomesdf_isip5_avgprev3.to_pickle(prefix + 'dfo-isip5_avgprev3 - ' + updated + '.pickle')
outcomesdf_isip5_avgprev4.to_pickle(prefix + 'dfo-isip5_avgprev4 - ' + updated + '.pickle')
#outcomesdf_isip5_avgprev_12.to_pickle(prefix + 'dfo-isip5_avgprev_12 - ' + updated + '.pickle')
outcomesdf_isip8_lag2.to_pickle(prefix + 'dfo-isip8_lag2 - ' + updated + '.pickle')
#outcomesdf_isip8_avgprev2.to_pickle(prefix + 'dfo-isip8_avgprev2 - ' + updated + '.pickle')
#outcomesdf_isip8_avgprev3.to_pickle(prefix + 'dfo-isip8_avgprev3 - ' + updated + '.pickle')
outcomesdf_isip8_avgprev4.to_pickle(prefix + 'dfo-isip8_avgprev4 - ' + updated + '.pickle')
#outcomesdf_isip8_avgprev_12.to_pickle(prefix + 'dfo-isip8_avgprev_12 - ' + updated + '.pickle')
In [50]:
#Confirming that the errors in drift calculations were due to lag-2 autocorrelations in the data.
def negative_autocorrelations_where_drift_calc_errors_occurred(outcomesdf):
df = outcomesdf
bycorr = df.ints_lag2corr.sort(inplace=False, ascending=True)
ranks = bycorr.reset_index().reset_index()
ranks['ranknum'] = ranks['index'] + 1
ranks = ranks.set_index('pid')
drifterrors = list(df[df.lag2devsq_drift.isnull()].index.values)
dranks = {d: (ranks.loc[d].ranknum, ranks.loc[d].ints_lag2corr) for d in drifterrors}
return dranks
[ 'isip500',
negative_autocorrelations_where_drift_calc_errors_occurred(outcomesdf_isip5_lag2),
'isip800',
negative_autocorrelations_where_drift_calc_errors_occurred(outcomesdf_isip8_lag2)
]
Out[50]:
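Why those drift errors co-occur with negative lag-2 autocorrelation: for a stationary series, E[(x_{i+2} - x_i)^2] = 2 * var * (1 - rho_2), so local_sq_abs estimates var * (1 - rho_2). With rho_2 < 0 it exceeds the total variance, ints_variance - local_sq_abs goes negative, and the sqrt in the drift formula returns NaN. A synthetic demonstration (toy data, not from the study):

import numpy as np
import pandas as pd
rng = np.random.RandomState(2)
dev = np.tile([1., 1., -1., -1.], 50) * 30  # period-4 pattern -> lag-2 corr near -1
x = pd.Series(500 + dev + rng.normal(0, 5, 200))
local_sq_abs = ((x - x.shift(2)) ** 2).sum() / (2. * (len(x) - 2))
print(x.var(ddof=1), local_sq_abs)             # local_sq_abs exceeds the total variance
print(np.sqrt(x.var(ddof=1) - local_sq_abs))   # nan, as in the drift column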
In [57]:
def sideplots(title, serieslist, namelist, **kwargs):
assert len(serieslist) == len(namelist)
count = len(serieslist)
from matplotlib import pyplot as plt
fig, axes = plt.subplots(nrows=count, ncols=3, **kwargs)
#fig.set_figheight(10)
#fig.set_figwidth(15)
#fig.suptitle('t', fontsize=25)
#plt.xlabel('xlabel', fontsize=18)
#plt.ylabel('ylabel', fontsize=16)
plots = [(namelist[i], serieslist[i]) for i in range(count)]
for (i, (n, s)) in enumerate(plots):
ax_hist = plt.subplot2grid((count, 3), (i, 0), colspan=2)
ax_hist.set_title(n, fontsize=16)
ax_line = plt.subplot2grid((count, 3), (i, 2), colspan=1)
ax_line.set_title(n, fontsize=16)
s.plot(ax=ax_line, linewidth=3)
s.hist(ax=ax_hist, bins=20)
fig.suptitle(title, fontsize=22)
plt.show()
#fig.tight_layout()
In [58]:
pids5 = sorted(set((p) for p in db_isip5.index.get_level_values('pid')))
isip5 = {p: db_isip5.xs(p) for p in pids5}
def isip5_hist(p):
ints = isip5[p].int_raw
ints_filt = isip5[p].ints_filtered
#sideplots(ints, ints_filt,
# plotname_top="pre-filter",
# plotname_bottom="post-filter"):
#ints.hist()
#plt.show()
for p in pids5:
print(p)
isip5_hist(p)
#ri = raw_input()
#if ri=="x":
break
#isip5['015']
In [60]:
pids8 = sorted(set((p) for p in db_isip8.index.get_level_values('pid')))
isip8 = {p: db_isip8.xs(p) for p in pids8}
pids8_gen = (p for p in pids8)
def next_twenty_800():
ran = 0
for p in pids8_gen:
ran += 1
if ran==10: break # stops after 9 pids per call, despite the function name
i_raw = isip8[p].int_raw
c = i_raw.count()
print(c)
m = i_raw.mean()
s = i_raw.std()
i_abs_cutoff = i_raw[(i_raw >= 600) & (i_raw <= 1000)]
i_sd_cutoff = i_raw[(i_raw <= m + 2.97*s) & (i_raw >= m - 2.97*s)]
sideplots(title='P. %s' % p,
serieslist=[i_raw, i_abs_cutoff, i_sd_cutoff],
namelist=['raw', 'abs cutoff', 'sd cutoff'])
In [74]:
next_twenty_800()
In [72]:
isip5['055'].ints_filtered.plot(marker="o")
Out[72]:
In [65]:
pids = (p for p in pids5)
def next_ten():
ran = 0
for p in pids:
p='055' # debug override: inspect pid 055 only (loop breaks below)
ran += 1
if ran==10: break
i_raw = isip5[p].int_raw
c = i_raw.count()
print(c)
m = i_raw.mean()
s = i_raw.std()
i_abs_cutoff = i_raw[(i_raw >= 375) & (i_raw <= 650)]
i_sd_cutoff = i_raw[(i_raw <= m + 2.97*s) & (i_raw >= m - 2.97*s)]
sideplots(title='P. %s' % p,
serieslist=[i_raw, i_abs_cutoff, i_sd_cutoff],
namelist=['raw', 'abs cutoff', 'sd cutoff'],
figsize=(15,8))
break
In [66]:
next_ten()
In [94]:
pids5 = sorted(set((p) for p in db_isip5.index.get_level_values('pid')))
isip5 = {p: db_isip5.xs(p) for p in pids5}
pids8 = sorted(set((p) for p in db_isip8.index.get_level_values('pid')))
isip8 = {p: db_isip8.xs(p) for p in pids8}
for p in pids8:
sideplots(title='P. %s' % p,
serieslist=[isip8[p].ints_filtered],
namelist=['ints_filtered'],
figsize=(19,5))
In [72]:
#df['drift'] = 100 * (1. / df.ints_mean) * np.sqrt(df.ints_variance - df.local_sq_abs)
#p017:
100 * (1. / 844.185541) * np.sqrt(1740.213950 - 1040.648841)
# problem: we have local_sq_abs values that are greater
# than the total ints_variance. That shouldn't be.
Out[72]:
In [19]:
db_isip5[db_isip5.task_ms >= isip_5_start_recording].to_csv('check_isip5_calcs_individual_pids_V4.csv')
outcomesdf_isip5.to_csv('check_isip5_calcs_outcomes_V4.csv')
In [20]:
db_isip5.xs('010', level='pid').ints.count()
Out[20]:
In [33]:
#db_isip8
test = outcomesdf_isip5_exp['exp_local_sq_abs'] - outcomesdf_isip5['local_sq_abs']
test.max()
Out[33]:
In [49]:
db_isip5.head().T
Out[49]:
In [104]:
#db_isip5[['task_ms', 'movingmean_prev4']].groupby(level='pid').plot(kind='scatter', x=0, y=1, figsize=(18,18))
db_isip5[['task_ms', 'movingmean_prev3']].plot(kind='scatter', x=0, y=1, figsize=(18,18))
Out[104]:
In [8]:
outcomesdf_isip8_avgprev4.sort('ints_variance', ascending=False)
#outcomesdf_isip5[30:]
Out[8]:
In [32]:
#don't need this: we'll get the full ID list when we merge them together later.
# set an index with full ID list to ensure that the index is consistent
# across all of the scales/index values/etc. that we'll be concatenating
# together once all processing steps are done.
#pid_list = ['010', '011', '012', '013', '014', '015', '016', '017',
# '018', '019', '020', '021', '022', '023', '024', '025',
# '026', '027', '028', '029', '030', '031', '032', '033',
# '034', '035', '036', '037', '038', '039', '040', '041',
# '042', '043', '044', '045', '046', '047', '048', '049',
# '050', '051', '052', '053', '054', '055', '056', '057',
# '058', '059', '060', '061', '062', '063', '064', '065',
# '066', '067', '068', '069', '070', '071', '072', '073',
# '074', '075', '076', '077', '078', '079', '080', '081',
# '082', '083', '084', '085', '086', '087', '088', '089',
# '090', '091', '092', '093', '094', '095', '096', '097',
# '098', '099', '100', '101', '102', '103', '104', '105',
#'106', '107', '108', '109', '110', '111', '112', '113',
#'114', '115', '116', '117', '118', '119', '120', '121']
#full_index = pd.DataFrame(index = pid_list)
In [31]:
#not using this anymore: it's easier to have all of the multiindex values
#(scales, isip5, isip8, etc) set up in the final assembly, rather than
#stick the ISIP5 / ISIP8 together in a multiindex already here. So we'll
#save the tasks under separate pickle files.
isip_out = pd.concat([scales,
isip_outcomes_taskdf(db_isip5),
isip_outcomes_taskdf(db_isip8),
],
axis=1, #defaults
join='outer',
#join_axes=None,
#ignore_index=False,
keys=['scales','isip5','isip8'],
#levels=None,
names=['task'],
#verify_integrity=False
)
isip_out[31:37]
Out[31]:
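For reference, a toy version (made-up frames) of how concat with keys= and names=['task'] yields the two-level columns that selections like dfo['isip5'] and xs(..., level='task', axis=1) rely on later:

import pandas as pd
a = pd.DataFrame({'ints_mean': [501., 498.]}, index=['015', '016'])
b = pd.DataFrame({'ints_mean': [803., 799.]}, index=['015', '016'])
both = pd.concat([a, b], axis=1, join='outer', keys=['isip5', 'isip8'], names=['task'])
print(both['isip5'].ints_mean)                 # pick one task level
print(both.xs('isip8', level='task', axis=1))  # same selection via xs, as in In [44]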
In [20]:
counts = {pid: str(db_isip5.xs(pid, level='pid').ints.count()) + " @500, "
+ str(db_isip8.xs(pid, level='pid').ints.count()) + " @800"
for pid in pid_list}
counts
Out[20]:
In [53]:
for pid in pid_list:
try:
plt.figure(figsize=(14,3))
db_isip8.loc[pid].ints.hist(bins=40)
print(pid)
plt.show()
except: pass
#outliers: 054, one point >1000
# 036, one point > 1000
# 056, two points > 900
# 062, one point > 1100
# 048, two points < 650 (almost whole set is >1000)
In [14]:
for pid in pid_list:
try:
plt.figure(figsize=(14,3))
db_isip5.loc[pid].ints.hist(bins=40)
print(pid)
plt.show()
except: pass
In [ ]:
dfo.to_csv('isip_data_output.csv')
# shell-open the CSV (Windows associates .csv files with a viewer)
!isip_data_output.csv
In [73]:
#for viewing data that will be kept...
data = dfo.loc[dfo.scales.exclusion_language==0]
In [28]:
#pd.set_option('display.multi_sparse', False)
#dfo['isip5']['ints_mean']
#dfo['isip5','ints_mean']
#dfo.loc(axis=1)['isip5','ints_mean']
#dfo.loc(axis=1)[('isip5','ints_mean')]
dfo['isip5'].drift[dfo['isip5'].drift > 6]
Out[28]:
In [42]:
dfo.head()
Out[42]:
In [44]:
dfo.xs('scales', level='task', axis=1).to_json('isip_data_output_scales.json')
dfo.xs('isip5', level='task', axis=1).to_json('isip_data_output_isip5.json')
dfo.xs('isip8', level='task', axis=1).to_json('isip_data_output_isip8.json')
In [17]:
outcomesdf_isip5
Out[17]:
In [11]:
x = outcomesdf_isip5['lag2devsq_mean']
y = outcomesdf_isip5_avgprev3['lagdev_avgprev3sq_mean']
#ids = sorted(set(x.index).intersection(set(y.index)))
#df = pd.DataFrame(index = ids)
sx = x.apply(lambda n: np.sqrt(n))
sy = y.apply(lambda n: np.sqrt(n))
dfo = pd.concat([sx, sy], axis=1, join='outer',
keys = ['x' + ' sqrt of ' + x.name,
'y' + ' sqrt of ' + y.name])
dfo.plot(x=0, y=1, kind='scatter', figsize=(12,12))
Out[11]:
In [22]:
plt.figure(figsize=(8,5))
plt.scatter(data.isip8.drift, data.isip5.drift)
plt.show()
print(data.isip8.drift[data.isip8.drift > 6])
print(data.isip5.drift[data.isip5.drift > 6])
data_rem_drift_ol = dfo.drop(['049', '055', '071', '073'])
plt.figure(figsize=(8,5))
plt.scatter(data_rem_drift_ol.isip8.drift, data_rem_drift_ol.isip5.drift)
plt.show()
In [23]:
data.scales.fsiq2
data.isip5.local
Out[23]:
In [24]:
plt.figure(figsize=(8,5))
plt.scatter(data.isip8.local, data.isip5.local)
plt.show()
print(data.isip8.drift[data.isip8.local > 9])
print(data.isip5.drift[data.isip5.local > 9])
data_rem_local_ol = dfo.drop(['049'])
plt.figure(figsize=(8,5))
plt.scatter(data_rem_local_ol.isip8.local, data_rem_local_ol.isip5.local)
plt.show()
In [25]:
plt.figure(figsize=(8,5))
plt.scatter(data.scales.wasivocab_rawscore, data.scales.wasimatrix_rawscore)
plt.show()
In [26]:
plt.figure(figsize=(8,5))
plt.scatter(data.scales.wasivocab_tscore, data.isip8.local)
plt.show()
In [27]:
compare_df = pd.concat([data.scales.wasivocab_tscore,
data.isip8.local],
axis=1)
compare_df = compare_df[compare_df.wasivocab_tscore.notnull()]
compare_df.loc['049', 'local'] = 7 # cap pid 049's outlier; avoid chained assignment
plt.figure(figsize=(8,5))
plt.scatter(compare_df.wasivocab_tscore, compare_df.local)
plt.show()
compare_arr = np.array(compare_df.T)
#compare_arr[0]
from scipy import stats
r, p = stats.pearsonr(compare_arr[0], compare_arr[1])
print("r: {r}, p: {p}".format(r=r, p=p))
In [28]:
compare_df = pd.concat([data.scales.wasimatrix_tscore,
data.isip8.local],
axis=1)
compare_df = compare_df[compare_df.wasimatrix_tscore.notnull()]
compare_df.loc['049', 'local'] = 7 # cap pid 049's outlier; avoid chained assignment
plt.figure(figsize=(8,5))
plt.scatter(compare_df.wasimatrix_tscore, compare_df.local)
plt.show()
compare_arr = np.array(compare_df.T)
#compare_arr[0]
from scipy import stats
r, p = stats.pearsonr(compare_arr[0], compare_arr[1])
print("r: {r}, p: {p}".format(r=r, p=p))
In [29]:
compare_df = pd.concat([data.scales.fsiq2,
data.isip8.local],
axis=1)
compare_df = compare_df[compare_df.fsiq2.notnull()]
compare_df.loc['049', 'local'] = 7 # cap pid 049's outlier; avoid chained assignment
plt.figure(figsize=(8,5))
plt.scatter(compare_df.fsiq2, compare_df.local)
plt.show()
compare_arr = np.array(compare_df.T)
#compare_arr[0]
from scipy import stats
r, p = stats.pearsonr(compare_arr[0], compare_arr[1])
print("r: {r}, p: {p}".format(r=r, p=p))
In [30]:
def rtest(df_columns):
    from scipy import stats
    import matplotlib.pylab as plt
    # use the passed-in frame (the original plotted the global compare_df
    # and, despite the name, never computed r)
    xname, yname = df_columns.columns[0], df_columns.columns[1]
    print(xname)
    print(yname)
    plt.figure(figsize=(3,3))
    plt.scatter(df_columns[xname], df_columns[yname])
    plt.show()
    r, p = stats.pearsonr(df_columns[xname], df_columns[yname])
    print("r: {r}, p: {p}".format(r=r, p=p))
rtest(compare_df)
In [31]:
compare_df = pd.concat([data.scales.fsiq2,
data.isip5.local],
axis=1)
compare_df = compare_df[compare_df.fsiq2.notnull()]
assert len(compare_df[compare_df.fsiq2.isnull()]) == 0
In [32]:
compare_arr = np.array(compare_df.T)
#compare_arr[0]
from scipy import stats
stats.pearsonr(compare_arr[0], compare_arr[1])
Out[32]:
In [175]:
def scatter_tooltips(df, x_col, y_col,
size_col=None,
color_col=None,
show_all_cols=False,
fig_size=(8, 5)):
#import matplotlib.pyplot as plt
#import numpy as np
import pandas as pd
import mpld3
from mpld3 import plugins
#x = df[x_col]
#y = df[y_col]
df_info = [x_col, y_col]
#for arg in args:
# df_info.append(arg)
# Define some CSS to control our custom labels
css = """
table { border-collapse: collapse; }
th { color: #ffffff; background-color: #000000; }
td { background-color: #cccccc; }
table, th, td { font-family:Arial, Helvetica, sans-serif;
border: 1px solid black; text-align: right; }
"""
fig, ax = plt.subplots()
fig.set_size_inches(fig_size)
ax.grid(True, alpha=0.3)
labels = []
for row in df.iterrows():
index, series = row
pid = index
label = pd.DataFrame(series)
labels.append(str(label.to_html()))
points = ax.plot(df[x_col],
df[y_col],
'o',
color='b',
markeredgecolor='k',
markersize=8,
markeredgewidth=1,
alpha=.6)
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
ax.set_title(x_col + ' . ' + y_col, size=16)
tooltip = plugins.PointHTMLTooltip(points[0], labels,
voffset=10, hoffset=10, css=css)
plugins.connect(fig, tooltip)
return mpld3.display()
In [176]:
scatter_tooltips(data, 'isip8_sq2dev_mean_sqrt', 'isip5_sq2dev_mean_sqrt',
fig_size=(12, 7.5))
Out[176]:
In [132]:
def d3plot(x, y, size=(10,6)):
import mpld3
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
fig.set_size_inches((8,5) )
scatter = ax.scatter(x, y,
#c=np.random.random(size=N),
s=40, #size
alpha=0.5,
cmap=plt.cm.jet)
ax.grid(color='white', linestyle='solid')
ax.set_title("Scatter Plot (with tooltips!)", size=10)
labels = ['{0}'.format(pid) for pid in x.index]
tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
mpld3.plugins.connect(fig, tooltip)
return mpld3.display()
In [133]:
d3plot(data['isip5_sq2devsum'], data['isip8_sq2devsum'])
Out[133]:
In [26]:
sset = db_isip8.xs(['056', 'tap_r'], level=['pid', 'stamp_type'])
print(sset.ints.mean())
print(sset.ints.std())
In [18]:
for pid in pid_list:
try:
sset = db_isip5.xs([pid, 'tap_r'], level=['pid', 'stamp_type'])
data = sset.ints
if data.min() > 400 or data.max() < 600: continue
print(pid)
plt.figure(figsize=(13,6))
data.hist(bins=100)
#annotating non-midpoint values
caption_y_increment = 0.2
median = data.median()
prev_ypos = 0
for idx, value in enumerate(data):
if np.abs(value - median) > 150:
caption = str(data.index[idx]) + ": " + str(value.round(1))
plt.annotate(caption, (value, prev_ypos + caption_y_increment))
prev_ypos += caption_y_increment
plt.show()
except:
print("error....")
#High outliers:
#p049 - will need to remove manually
In [99]:
taps = db_isip5.xs('tap_r', level='stamp_type').ints
pmeans = taps.groupby(level='pid').mean()
data = pmeans
##############################
plt.figure(figsize=(13,6))
data.hist(bins=25)
#annotating non-midpoint values
caption_y_increment = 1
median = data.median()
prev_ypos = 0
for idx, value in enumerate(data):
if np.abs(value - median) > 25:
caption = str(data.index[idx]) + ": " + str(value.round(1))
plt.annotate(caption, (value, prev_ypos + caption_y_increment))
prev_ypos += caption_y_increment
plt.show()
#why is pid 049's mean so low?
In [100]:
sset = db_isip8.xs(['048', 'tap_r'], level=['pid', 'stamp_type'])
data = sset.ints.mean()
data - data*0.5
Out[100]:
In [79]:
sset[60:]
Out[79]:
In [65]:
sset = db_isip5.xs(['055', 'tap_r'], level=['pid', 'stamp_type'])
sset_ints = sset.ints
sset_ints.hist(bins=100, figsize=(14,5))
Out[65]:
In [98]:
taps = db_isip8.xs('tap_r', level='stamp_type').ints
pmeans = taps.groupby(level='pid').mean()
data = pmeans
##############################
plt.figure(figsize=(13,6))
data.hist(bins=25)
#annotating non-midpoint values
caption_y_increment = 0.8
median = data.median()
prev_ypos = 0
for idx, value in enumerate(data):
if np.abs(value - median) > 60:
caption = str(data.index[idx]) + ": " + str(value.round(1))
plt.annotate(caption, (value, prev_ypos + caption_y_increment))
prev_ypos += caption_y_increment
plt.show()
#Good distribution-- 048 is a bit high
In [60]:
ISIP5_DATASTART = 22000
sset = db_isip5.xs(['055', 'tap_r'], level=['pid', 'stamp_type'])
sset[sset.task_ms > ISIP5_DATASTART]
# a double-interval has snuck through here.
Out[60]:
In [45]:
taps = db_isip5.xs('tap_r', level='stamp_type').ints
pstdev = taps.groupby(level='pid').std()
data = pstdev
plt.figure(figsize=(13,6))
data.hist(bins=25)
#annotating non-midpoint values
caption_y_increment = 1
median = data.median()
prev_ypos = 0
for idx, value in enumerate(data):
if np.abs(value - median) > 8:
caption = str(data.index[idx]) + ": " + str(value.round(1))
plt.annotate(caption, (value, prev_ypos + caption_y_increment))
prev_ypos += caption_y_increment
plt.show()
In [93]:
lt = [1, 2, 3, 4, 5, 7]
r = [2, 3, 4]
set(lt).difference(set(r))
Out[93]:
In [14]:
import matplotlib.pylab as plt
%matplotlib inline
pd.options.display.mpl_style = 'default'
In [14]:
#EXPECTED_RTAP_MINS = {}
In [15]:
dfo = pd.DataFrame(index = pid_list)
dfo['missing_tasks'] = ''
for t in TASK_NAMES_USING:
df = dbase.xs(t, drop_level=True) #index: pid, stamp_type, csv_line
for p in pid_list:
dfp = df.xs(p, drop_level=True)
if dfp.micros.count()==0: dfo.loc[p] += t + ' '
dfo.loc[dfo.missing_tasks != '']
Out[15]:
In [16]:
# Number of loopback midi signals per participant per task
# Slight variations for improv tasks: possibly due to
# overflows (checkable in original CSV files)?
# T1_SMS_5 and T1_SMS_8 tasks were slightly longer for
# participants 010 through 015.
#(sms5: 150 vs 130; sms8: 140 vs 120)
# Should discard the additional intervals for these p's.
loopct = pd.DataFrame(index = pid_list)
for t in TASK_NAMES_USING:
loopct['lbcount_' + t] = np.nan
dft = dbase.xs(t, drop_level=True) #index: pid, stamp_type, csv_line
for p in pid_list:
dftp = dft.xs(p, drop_level=True)
task_lbcount = dftp.xs('loopback').micros.count()
loopct['lbcount_' + t].loc[p] = task_lbcount
loopct[::25]
#loopct.to_csv(....)
Out[16]:
In [17]:
#lbdata = dfo.drop('missing_tasks', axis=1)
data = loopct
for c in data.columns:
print(c)
if 'ISIP' in c:
print('skipping ISIP data\n')
continue
data_range = data[c].max() - data[c].min()
if data_range==0:
print("all data points = {0} \n\n".format(data[c].max()))
else:
plt.figure(figsize=(13,3))
data[c].hist(bins=int(data_range)) # hist() needs an integer bin count
#annotating non-midpoint values
median = data[c].median()
prev_ypos = 0
for idx, value in enumerate(data[c]):
if value != median:
caption = str(data[c].index[idx]) + ": " + str(value)
plt.annotate(caption, (value, prev_ypos + 10))
prev_ypos += 10
#for i, txt in enumerate(n):
# ax.annotate(txt, (x[i],y[i]))
plt.show()
In [18]:
tapct = pd.DataFrame(index = pid_list)
for t in TASK_NAMES_USING:
tapct[t] = np.nan
dft = dbase.xs(t, drop_level=True) #index: pid, stamp_type, csv_line
for p in pid_list:
dftp = dft.xs(p, drop_level=True)
task_tapcount = dftp.xs('tap_r').micros.count()
tapct[t].loc[p] = task_tapcount
In [19]:
data = tapct
for c in data.columns:
print(c)
if 'ISIP' in c:
print('skipping ISIP data\n')
continue
data_range = data[c].max() - data[c].min()
if data_range==0:
print("all data points = {0} \n\n".format(data[c].max()))
else:
plt.figure(figsize=(13,3))
data[c].hist(bins=40)
#annotating non-midpoint values
caption_y_increment = 5
median = data[c].median()
prev_ypos = 0
for idx, value in enumerate(data[c]):
if np.abs(value - median) > 40:
caption = str(data[c].index[idx]) + ": " + str(value)
plt.annotate(caption, (value, prev_ypos + caption_y_increment))
prev_ypos += caption_y_increment
#for i, txt in enumerate(n):
# ax.annotate(txt, (x[i],y[i]))
plt.show()
#isip: Why such a large number of timestamps for some p's?
#ISIP_5: 094: 281, 055: 284, 041: 279, ...
#ISIP_8: 041: 288, 055: 280, 017: 217, ...
#...Right: these aren't filtered yet here!
In [136]:
def custom_histogram(data_series):
cleanintervals = data_series.dropna()
millis = cleanintervals.divide(1000) # intervals are in microseconds -> ms
millis_array = np.array(millis)
plt.title("Interval lengths")
plt.xlabel("Milliseconds")
plt.ylabel("Frequency")
plt.hist(millis_array,
bins=35,
#all defaults below
range=None,
normed=False,
weights=None,
cumulative=False,
bottom=None,
histtype='bar',
align='mid',
orientation='vertical',
rwidth=None,
log=False,
color=None,
label=None,
stacked=False,
hold=None,)
plt.gcf().set_size_inches(7, 7)
plt.show()
In [607]:
print("pre-filter length, maximum:")
print(len(taps.interval)) #before filter
print(max(taps.interval)); print()
tapsfilt = taps.copy(deep = True)
tapsfilt.loc[
tapsfilt['interval'] > 750000, # boolean selector on axis 0
'interval' # index value selector on axis 1
] = np.nan
print("post-filter length, maximum:")
print(len(tapsfilt.interval)) #after (includes NaN)
print(max(tapsfilt.interval))
custom_histogram(tapsfilt.interval)